{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "import pandas as pd \n",
    "import os\n",
    "import shutil\n",
    "from pathlib import Path\n",
    "from tqdm import tqdm\n",
    "# Start every run from an empty output directory so files from earlier runs cannot linger.\n",
    "save_dir = Path(\"opendata\")\n",
    "# `exists` has to be called: the bare bound method is always truthy, so the guard tested nothing.\n",
    "if save_dir.exists():\n",
    "    # Best-effort removal: ignore_errors=True keeps the cell running if an entry cannot be deleted.\n",
    "    shutil.rmtree(save_dir, ignore_errors=True)\n",
    "\n",
    "# exist_ok=True tolerates a directory that survived a partial removal above.\n",
    "os.makedirs(name=save_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "processsssssssssss:   0%|          | 0/6 [00:00<?, ?it/s]Found cached dataset json (C:/Users/yuanz/.cache/huggingface/datasets/BelleGroup___json/BelleGroup--generated_chat_0.4M-b65d3913b01e68e0/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
      "100%|██████████| 1/1 [00:00<00:00, 39.98it/s]\n",
      "processsssssssssss:  17%|█▋        | 1/6 [00:09<00:48,  9.77s/it]Found cached dataset json (C:/Users/yuanz/.cache/huggingface/datasets/BelleGroup___json/BelleGroup--school_math_0.25M-01ed2660a3b251c0/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
      "100%|██████████| 1/1 [00:00<00:00, 142.90it/s]\n",
      "processsssssssssss:  33%|███▎      | 2/6 [00:20<00:40, 10.07s/it]Found cached dataset json (C:/Users/yuanz/.cache/huggingface/datasets/BelleGroup___json/BelleGroup--train_2M_CN-9f5684b36fb958f4/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
      "100%|██████████| 1/1 [00:00<00:00, 10.75it/s]\n",
      "processsssssssssss:  50%|█████     | 3/6 [00:32<00:33, 11.12s/it]Found cached dataset json (C:/Users/yuanz/.cache/huggingface/datasets/BelleGroup___json/BelleGroup--train_1M_CN-c99dfb2fac2ab434/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
      "100%|██████████| 1/1 [00:00<00:00, 43.48it/s]\n",
      "processsssssssssss:  67%|██████▋   | 4/6 [00:39<00:18,  9.40s/it]Found cached dataset json (C:/Users/yuanz/.cache/huggingface/datasets/BelleGroup___json/BelleGroup--train_0.5M_CN-30591af9a26b6c1b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
      "100%|██████████| 1/1 [00:00<00:00, 66.67it/s]\n",
      "processsssssssssss:  83%|████████▎ | 5/6 [00:44<00:07,  7.98s/it]Found cached dataset json (C:/Users/yuanz/.cache/huggingface/datasets/BelleGroup___json/BelleGroup--multiturn_chat_0.8M-9e895b626c42bf3b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
      "100%|██████████| 1/1 [00:00<00:00, 21.03it/s]\n",
      "processsssssssssss: 100%|██████████| 6/6 [00:57<00:00,  9.55s/it]\n"
     ]
    }
   ],
   "source": [
    "# Hugging Face Hub ids of the BELLE datasets to sample from.\n",
    "belle_list = ['BelleGroup/generated_chat_0.4M', 'BelleGroup/school_math_0.25M', 'BelleGroup/train_2M_CN', 'BelleGroup/train_1M_CN',\n",
    "              'BelleGroup/train_0.5M_CN', 'BelleGroup/multiturn_chat_0.8M']\n",
    "\n",
    "datasets_class = \"BelleGroup\"  # suffix shared by every exported file name\n",
    "max_rows = 2000  # number of leading rows kept from each dataset's train split\n",
    "\n",
    "# For each dataset: take the first `max_rows` training rows, build question/answer columns,\n",
    "# and write them to save_dir as a JSON array of {\"q\", \"a\"} records.\n",
    "for index, temp_data_name in tqdm(enumerate(belle_list), total=len(belle_list), desc=\"processsssssssssss\"):\n",
    "    train_split = load_dataset(path=temp_data_name)['train']\n",
    "    # Slice before converting so only the kept rows are materialised as a DataFrame,\n",
    "    # rather than converting the whole split and discarding most of it.\n",
    "    # min(...) guards against a split shorter than max_rows (select rejects out-of-range indices).\n",
    "    sample_df = train_split.select(range(min(max_rows, len(train_split)))).to_pandas()\n",
    "    qa_df = sample_df.assign(\n",
    "        # Question = instruction, a newline, then the input field (same text as the per-row f-string).\n",
    "        q=[f'{instruction}\\n{extra}' for instruction, extra in zip(sample_df['instruction'], sample_df['input'])],\n",
    "        a=sample_df['output'],\n",
    "    )[['q', 'a']]\n",
    "    # force_ascii=False writes non-ASCII characters as-is instead of \\\\uXXXX escapes.\n",
    "    qa_df.to_json(save_dir.joinpath(f\"data_{index}_{datasets_class}.json\"), force_ascii=False, orient='records')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mynet",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
